The Office of Advocacy’s Small Business Profiles are an annual analysis of each state’s small business activities. Each profile gathers the latest information from key federal data-gathering agencies to provide a snapshot of small business health and economic activity. This year’s profiles report on state economic growth and employment; small business employment, industry composition, and turnover; plus business owner demographics and county-level employment change.
# Widen the notebook container so wide tables/DataFrames render without scrolling.
from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))
from IPython.display import IFrame
import pandas as pd
import multiprocessing
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool
from functools import partial
import math
# Handle s3 or local
import s3fs
from os import listdir
from os.path import isfile, join
import subprocess
This dataset from the U.S. Small Business Administration (SBA) can be downloaded from the following website:
https://www.sba.gov/advocacy/small-business-profiles-states-and-territories-2016
Assess the pros and cons of the most popular libraries for reading PDFs.
# Make the repository root importable so Tools.paths resolves.
# NOTE(review): the star import presumably supplies path_s3 and path_local,
# which the cells below reference -- confirm against Tools/paths.py.
import sys
sys.path.insert(0,'../')
from Tools.paths import *
def list_files(path, ext='pdf'):
    """Return the base names (extension stripped) of all *ext* files at *path*.

    Supports both ``s3://`` prefixes (listed via the AWS CLI) and local
    directories (listed via os.listdir).

    Bug fix: the original ignored its *path* argument and read the globals
    ``path_s3`` / ``path_local`` instead, so the parameter had no effect.
    """
    if path.startswith('s3://'):
        # `aws s3 ls` prints one entry per line; the file name is the last
        # whitespace-separated field.  check_output returns bytes on
        # Python 3, so decode before splitting.
        listing = subprocess.check_output(['aws', 's3', 'ls', path]).decode()
        names = [line.split(" ")[-1] for line in listing.split('\n')]
    else:
        names = [f for f in listdir(path) if isfile(join(path, f))]
    # Keep only files with the requested extension, then strip it.
    suffix = '.{}'.format(ext)
    names = [f for f in names if f.endswith(suffix)]
    return [f.replace(suffix, '') for f in names]
def path(path, name, ext='pdf'):
    """Join *path*, *name* and *ext* into a single '<path><name>.<ext>' string."""
    return '%s%s.%s' % (path, name, ext)
import PyPDF2
def load_pdf(path_file):
    """Extract the text of every page of the PDF at *path_file* using PyPDF2.

    Returns a list with one entry per page; each entry is the list of
    whitespace-separated tokens produced by ``extractText()``.  Handles both
    ``s3://`` URLs (opened through s3fs) and local paths.

    Fixes: the local branch used the Python-2-only ``file()`` builtin and
    never closed the handle; ``xrange`` was Python-2-only as well.
    """
    def get_content(fp_in):
        # One token list per page.
        content = []
        pdf = PyPDF2.PdfFileReader(fp_in)
        number_of_pages = pdf.getNumPages()
        for i in range(number_of_pages):  # range works on both Python 2 and 3
            content.append(pdf.getPage(i).extractText().split())
        return content

    if path_file.startswith('s3://'):
        fs = s3fs.S3FileSystem()
        with fs.open(path_file, 'rb') as fp_in:
            return get_content(fp_in)
    # `with open(...)` replaces the removed `file()` builtin and guarantees
    # the handle is closed (the original leaked it).
    with open(path_file, 'rb') as fp_in:
        return get_content(fp_in)
%%time
# NOTE(review): notebook cell -- %%time is an IPython magic, not Python syntax,
# and the prints below are Python 2 statements.
# Pick the second file in the S3 bucket and extract it with PyPDF2.
files = list_files(path_s3)[1]
path_file = path(path_s3,files)
file_pdf = load_pdf(path_file)
# Dump the token list of every page.
for fp in file_pdf:
print fp
print '\n'
# Peek at page 3 (index 2).
file_pdf[2]
import tabula
def load_pdf(path_file):
    """Parse every table in the PDF at *path_file* with tabula.

    Accepts either an ``s3://`` URL (opened through s3fs) or a local path,
    and returns tabula's multiple-tables result.
    """
    if not path_file.startswith('s3://'):
        return tabula.read_pdf(path_file, multiple_tables=True)
    fs = s3fs.S3FileSystem()
    with fs.open(path_file, 'rb') as fp_in:
        return tabula.read_pdf(fp_in, multiple_tables=True)
# tabula.read_pdf(file_path,multiple_tables=True, pages = 3)
%%time
# Benchmark tabula on the second local file.
files = list_files(path_local)[1]
path_file = path(path_local,files)
file_pdf = load_pdf(path_file)
# Display the parsed tables.
file_pdf
import pdfquery
def load_pdf(path_file):
    """Open the PDF at *path_file* with pdfquery and pre-load its layout tree.

    Works with both ``s3://`` URLs (opened through s3fs) and local paths;
    returns the loaded PDFQuery document.
    """
    if path_file.startswith('s3://'):
        fs = s3fs.S3FileSystem()
        with fs.open(path_file, 'rb') as fp_in:
            document = pdfquery.PDFQuery(fp_in)
            document.load()
        return document
    document = pdfquery.PDFQuery(path_file)
    document.load()
    return document
%%time
# Benchmark pdfquery on the second local file; file_pdf is reused by all
# of the extraction cells below.
files = list_files(path_local)[1]
path_file = path(path_local,files)
file_pdf = load_pdf(path_file)
def getCoordinates(pdf, query, type_search="Line"):
    """Yield one bounding-box dict per element of *pdf* matching *query*.

    pdf         -- a loaded pdfquery.PDFQuery document
    query       -- substring to search for
    type_search -- "Line" or "Box": which LTText*Horizontal element to match

    Each dict carries the corner coordinates (floored/ceiled to 3 decimals
    so the box always encloses the element), the element text, and the page
    id of the enclosing LTPage.

    Fix: the original called ``iterator.next()``, a Python-2-only method;
    the ``next()`` builtin works on both Python 2.6+ and Python 3.
    """
    matches = pdf.pq('LTText%sHorizontal:contains("%s")' % (type_search, query))
    for n in matches:
        d = dict()
        d["left_corner"] = math.floor(float(n.layout.x0) * 1000) / 1000.0
        d["bottom_corner"] = math.floor(float(n.layout.y0) * 1000) / 1000.0
        d["right_corner"] = math.ceil(float(n.layout.x1) * 1000) / 1000.0
        d["upper_corner"] = math.ceil(float(n.layout.y1) * 1000) / 1000.0
        d["text"] = n.layout.get_text()
        # First LTPage ancestor gives the page the match lives on.
        d["pageid"] = int(float(next(n.iterancestors('LTPage')).layout.pageid))
        yield d
# Locate the 'Small Businesses' line and inspect its bounding box.
g = getCoordinates(file_pdf,'Small Businesses', type_search='Line')
d = next(g,None)
d
# Pull the text boxes that overlap the located bounding box on that page.
file_pdf.pq(('LTPage[pageid="%s"] LTTextBoxHorizontal:overlaps_bbox("%f,%f,%f,%f")' % (d['pageid'],
d['left_corner'],
d['bottom_corner'],
d['right_corner'],
d['upper_corner']))).text()
# Widen the search to the left page edge to capture leading text too.
left_corner = 0
file_pdf.pq(('LTPage[pageid="%s"] LTTextBoxHorizontal:overlaps_bbox("%f,%f,%f,%f")' % (d['pageid'],
left_corner,
d['bottom_corner'],
d['right_corner'],
d['upper_corner']))).text()
# Section headings to anchor on when extracting the key-figure blocks.
KeyFigures = ['EMPLOYMENT',
'DIVERSITY',
'TRADE']
# Extend each heading's box 30 points downward to capture the figure text.
delta_bottom = 30
Info = [('with_formatter', 'text')]
for kf in KeyFigures:
g = getCoordinates(pdf=file_pdf,query=kf,type_search="Box")
d = next(g,None)
Info.append(tuple((kf,'LTPage[pageid="%s"] LTTextBoxHorizontal:overlaps_bbox("%f,%f,%f,%f")'%(d['pageid'],
d["left_corner"],
d["bottom_corner"]-delta_bottom,
d["right_corner"],
d["upper_corner"]))))
info = file_pdf.extract(Info)
info
def info1(file_pdf):
    """Extract the demographic-ownership lines from a loaded profile PDF.

    Returns the pdfquery extract() result: a dict mapping each demographic
    group keyword to the text of its line, read from the keyword's left
    edge out to x=300.

    Robustness fix: the original crashed with a None subscript when a
    keyword was absent from the PDF; missing groups are now skipped.
    """
    col_right_align = 300  # right edge of the value column on the page
    DemographicGroup = ['American-owned',
                        'Asian-owned',
                        'Islander-owned',
                        'Hispanic-owned',
                        'Alaskan-owned',
                        'Minority-owned',
                        'Nonminority-owned']
    DemographicInfo = [('with_formatter', 'text')]
    for dg in DemographicGroup:
        g = getCoordinates(pdf=file_pdf, query=dg, type_search="Line")
        d = next(g, None)
        if d is None:
            # Keyword not present in this profile -- skip it instead of
            # raising TypeError on d["left_corner"].
            continue
        DemographicInfo.append((dg,
                                'LTTextLineHorizontal:in_bbox("%f,%f,%f,%f")' % (
                                    d["left_corner"],
                                    d["bottom_corner"],
                                    col_right_align,
                                    d["upper_corner"])))
    info = file_pdf.extract(DemographicInfo)
    return info
info1(file_pdf)
def getTable(file_pdf, col_width, row_space, row_height, title, bottom_corner_dif, headers, col_left_align):
    """Scrape a fixed-layout table from a loaded profile PDF.

    Anchors on the line containing *title*, then walks downward one row at a
    time (*row_space* points per step), extracting one cell per column from
    the x positions in *col_left_align*.  Stops after the row whose first
    cell contains 'Total'.

    Returns a list of rows; the first row is *headers*.

    Fixes: Python-2-only ``xrange``; a redundant generator that was rebuilt
    as a list each iteration; and a safety net so a missing 'Total' row no
    longer loops forever.
    """
    table = [headers]
    g = getCoordinates(pdf=file_pdf, query=title, type_search="Line")
    d = next(g, None)
    pageid = d['pageid']
    # First data row sits bottom_corner_dif points below the title line.
    bottom_corner = d['bottom_corner'] - bottom_corner_dif
    while True:
        boxes = []
        for c in range(len(headers)):  # range works on both Python 2 and 3
            boxes.append(('col_%s' % c,
                          'LTPage[pageid="%s"] LTTextLineHorizontal:overlaps_bbox("%f,%f,%f,%f")' % (
                              pageid,
                              col_left_align[c],
                              bottom_corner,
                              col_left_align[c] + col_width,
                              bottom_corner + row_height)))
        row = file_pdf.extract(boxes)
        table.append([row['col_{}'.format(c)].text() for c in range(len(headers))])
        if 'Total' in row['col_0'].text():
            break
        if bottom_corner < 0:
            # Safety net: 'Total' row never found -- return what we have
            # instead of scanning below the page forever.
            break
        bottom_corner -= row_space
    return table
def info2(file_pdf):
    """Scrape Table 1 (small-firm counts by industry) from the profile PDF."""
    # Page-layout constants measured for the Table 1 grid.
    layout = {
        'col_width': 35,
        'col_left_align': [50, 295, 371, 449, 532],
        'row_space': 16.78,
        'row_height': 14,
        'bottom_corner_dif': 126.91,
    }
    headers = ['Industry',
               '1-499 Employees',
               '1-19 Employees',
               'Nonemployer Firms',
               'Total Small Firms']
    return getTable(file_pdf=file_pdf,
                    headers=headers,
                    title="Table 1",
                    **layout)
info2(file_pdf)
def info3(file_pdf):
    """Scrape Table 2 (small-business employment by industry) from the profile PDF."""
    headers = ['Industry',
               'Small Business Employment',
               'Total Private Employment',
               'Small Business Emp Share']
    # Layout constants measured for the Table 2 grid.
    return getTable(file_pdf=file_pdf,
                    col_width=35,
                    col_left_align=[50, 325, 400, 532],
                    row_space=13.5,
                    row_height=12.4,
                    bottom_corner_dif=115.5,
                    headers=headers,
                    title="Table 2")
info3(file_pdf)
def process_file(path_file):
    """Load one profile PDF and bundle every extracted section into a dict.

    The result holds the source path under 'file', the demographic lines
    from info1 merged in at the top level, and the two scraped tables
    under 'industry' and 'employment'.
    """
    file_pdf = load_pdf(path_file)
    record = {'file': path_file}
    record.update(info1(file_pdf))
    record['industry'] = info2(file_pdf)
    record['employment'] = info3(file_pdf)
    return record
# https://stackoverflow.com/questions/29494001/how-can-i-abort-a-task-in-a-multiprocessing-pool-after-a-timeout
def abortable_worker(func, *args, **kwargs):
    """Run ``func(*args)`` on a helper thread, aborting after ``timeout`` seconds.

    kwargs may carry ``timeout`` (seconds, default None = wait forever).
    Returns func's result, or re-raises multiprocessing.TimeoutError on
    expiry.  Adapted from
    https://stackoverflow.com/questions/29494001

    Fix: the original only terminated its ThreadPool on the timeout path,
    leaking a worker thread on every successful call; the pool is now torn
    down unconditionally.
    """
    timeout = kwargs.get('timeout', None)
    p = ThreadPool(1)
    try:
        res = p.apply_async(func, args=args)
        try:
            # Wait up to `timeout` seconds for func to complete.
            return res.get(timeout)
        except multiprocessing.TimeoutError:
            print("Aborting due to timeout ")
            raise
    finally:
        p.terminate()
# NOTE(review): Python 2 notebook cell -- `print i` below is a Python 2
# statement; the cell will not run under Python 3 as written.
if __name__ == '__main__':
# Results are appended here by the apply_async callbacks.
result = list()
# maxtasksperchild=1 recycles each worker after one PDF, containing leaks.
pool = multiprocessing.Pool(maxtasksperchild=1)
files = list_files(path_s3)
# Only process the first four profiles in this test run.
files = files[0:4]
for i in files:
print i
# Wrap process_file so any single PDF is abandoned after 60 seconds.
abortable_func = partial(abortable_worker, process_file, timeout=60)
path_file = path(path_s3,i)
pool.apply_async(abortable_func, args=(path_file, ), callback=result.append)
pool.close()
pool.join()
result
## Test